Multiple Linear Regression Project on Medical Cost Personal Datasets - Applied Linear Regression

Astrid Wang

December 18, 2020

datasource: https://www.kaggle.com/mirichoi0218/insurance or https://gist.github.com/meperezcuello/82a9f1c1c473d6585e750ad2e3c05a41

The report paper including method discussion and reference is in a separate docs file

#installed.packages("car")
#install.packages("olsrr")
#install.packages("reshape2")
#install.packages("tidyverse")
#install.packages("caret")
#install.packages("MuMIn")
#library(kenlab)
library(MuMIn)
library(reshape2)
library(ggplot2)
library("olsrr")
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(car)
## Loading required package: carData
library(caret)
## Loading required package: lattice
df<- read.csv(file = 'insurance2.csv')
head(df)
df <- transform(sex=factor(sex),smoker=factor(smoker),region=factor(region),df)
f1<-lm(charges~age+sex+bmi+children+smoker+region,data=df) # base model, or full model
summary(f1)
## 
## Call:
## lm(formula = charges ~ age + sex + bmi + children + smoker + 
##     region, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11304.9  -2848.1   -982.1   1393.9  29992.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11938.5      987.8 -12.086  < 2e-16 ***
## age                256.9       11.9  21.587  < 2e-16 ***
## sexmale           -131.3      332.9  -0.394 0.693348    
## bmi                339.2       28.6  11.860  < 2e-16 ***
## children           475.5      137.8   3.451 0.000577 ***
## smokeryes        23848.5      413.1  57.723  < 2e-16 ***
## regionnorthwest   -353.0      476.3  -0.741 0.458769    
## regionsoutheast  -1035.0      478.7  -2.162 0.030782 *  
## regionsouthwest   -960.0      477.9  -2.009 0.044765 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6062 on 1329 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7494 
## F-statistic: 500.8 on 8 and 1329 DF,  p-value: < 2.2e-16
contrasts(df$sex)
##        male
## female    0
## male      1
contrasts(df$smoker)
##     yes
## no    0
## yes   1
contrasts(df$region)
##           northwest southeast southwest
## northeast         0         0         0
## northwest         1         0         0
## southeast         0         1         0
## southwest         0         0         1
vif(f1)
##              GVIF Df GVIF^(1/(2*Df))
## age      1.016822  1        1.008376
## sex      1.008900  1        1.004440
## bmi      1.106630  1        1.051965
## children 1.004011  1        1.002003
## smoker   1.012074  1        1.006019
## region   1.098893  3        1.015841
plot(density(df$charges))

plot(density(log(df$charges)))

plot(f1)

table(df$children) # try to understand the dataframe
## 
##   0   1   2   3   4   5 
## 574 324 240 157  25  18
plot(df$age,df$children)

plot(log(df$age),log(df$charges))

Apply backward elimination for full model

f2<-ols_step_backward_p(f1,details = TRUE) 
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . age 
## 2 . sex 
## 3 . bmi 
## 4 . children 
## 5 . smoker 
## 6 . region 
## 
## We are eliminating variables based on p value...
## 
## - sex 
## 
## Backward Elimination: Step 1 
## 
##  Variable sex Removed 
## 
##                            Model Summary                             
## --------------------------------------------------------------------
## R                       0.867       RMSE                   6060.178 
## R-Squared               0.751       Coef. Var                45.667 
## Adj. R-Squared          0.750       MSE                36725751.333 
## Pred R-Squared          0.747       MAE                    4171.710 
## --------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                        ANOVA                                        
## -----------------------------------------------------------------------------------
##                         Sum of                                                     
##                        Squares          DF        Mean Square       F         Sig. 
## -----------------------------------------------------------------------------------
## Regression         1.47229e+11           7    21032710327.911    572.697    0.0000 
## Residual       48845249272.988        1330       36725751.333                      
## Total         196074221568.368        1337                                         
## -----------------------------------------------------------------------------------
## 
##                                            Parameter Estimates                                            
## ---------------------------------------------------------------------------------------------------------
##           model          Beta    Std. Error    Std. Beta       t        Sig          lower         upper 
## ---------------------------------------------------------------------------------------------------------
##     (Intercept)    -11990.270       978.762                 -12.250    0.000    -13910.355    -10070.185 
##             age       256.974        11.891        0.298     21.610    0.000       233.646       280.301 
##             bmi       338.665        28.559        0.171     11.858    0.000       282.639       394.690 
##        children       474.566       137.740        0.047      3.445    0.001       204.355       744.778 
##       smokeryes     23836.301       411.856        0.795     57.875    0.000     23028.341     24644.260 
## regionnorthwest      -352.182       476.120       -0.012     -0.740    0.460     -1286.211       581.847 
## regionsoutheast     -1034.360       478.537       -0.038     -2.162    0.031     -1973.130       -95.590 
## regionsouthwest      -959.375       477.778       -0.034     -2.008    0.045     -1896.656       -22.094 
## ---------------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables satisfy the condition of p value = 0.3
## 
## 
## Variables Removed: 
## 
## - sex 
## 
## 
## Final Model Output 
## ------------------
## 
##                            Model Summary                             
## --------------------------------------------------------------------
## R                       0.867       RMSE                   6060.178 
## R-Squared               0.751       Coef. Var                45.667 
## Adj. R-Squared          0.750       MSE                36725751.333 
## Pred R-Squared          0.747       MAE                    4171.710 
## --------------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                        ANOVA                                        
## -----------------------------------------------------------------------------------
##                         Sum of                                                     
##                        Squares          DF        Mean Square       F         Sig. 
## -----------------------------------------------------------------------------------
## Regression         1.47229e+11           7    21032710327.911    572.697    0.0000 
## Residual       48845249272.988        1330       36725751.333                      
## Total         196074221568.368        1337                                         
## -----------------------------------------------------------------------------------
## 
##                                            Parameter Estimates                                            
## ---------------------------------------------------------------------------------------------------------
##           model          Beta    Std. Error    Std. Beta       t        Sig          lower         upper 
## ---------------------------------------------------------------------------------------------------------
##     (Intercept)    -11990.270       978.762                 -12.250    0.000    -13910.355    -10070.185 
##             age       256.974        11.891        0.298     21.610    0.000       233.646       280.301 
##             bmi       338.665        28.559        0.171     11.858    0.000       282.639       394.690 
##        children       474.566       137.740        0.047      3.445    0.001       204.355       744.778 
##       smokeryes     23836.301       411.856        0.795     57.875    0.000     23028.341     24644.260 
## regionnorthwest      -352.182       476.120       -0.012     -0.740    0.460     -1286.211       581.847 
## regionsoutheast     -1034.360       478.537       -0.038     -2.162    0.031     -1973.130       -95.590 
## regionsouthwest      -959.375       477.778       -0.034     -2.008    0.045     -1896.656       -22.094 
## ---------------------------------------------------------------------------------------------------------
f2<-lm(charges ~ age  + bmi + children + smoker+region, data = df)
summary(f2)
## 
## Call:
## lm(formula = charges ~ age + bmi + children + smoker + region, 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11367.2  -2835.4   -979.7   1361.9  29935.5 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -11990.27     978.76 -12.250  < 2e-16 ***
## age                256.97      11.89  21.610  < 2e-16 ***
## bmi                338.66      28.56  11.858  < 2e-16 ***
## children           474.57     137.74   3.445 0.000588 ***
## smokeryes        23836.30     411.86  57.875  < 2e-16 ***
## regionnorthwest   -352.18     476.12  -0.740 0.459618    
## regionsoutheast  -1034.36     478.54  -2.162 0.030834 *  
## regionsouthwest   -959.37     477.78  -2.008 0.044846 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6060 on 1330 degrees of freedom
## Multiple R-squared:  0.7509, Adjusted R-squared:  0.7496 
## F-statistic: 572.7 on 7 and 1330 DF,  p-value: < 2.2e-16

Transformation Attemptation

summary(powerTransform(f2)) # try power transformations but failed
## bcPower Transformation to Normality 
##    Est Power Rounded Pwr Wald Lwr Bnd Wald Upr Bnd
## Y1    0.1507        0.15       0.1049       0.1966
## 
## Likelihood ratio test that transformation parameter is equal to 0
##  (log transformation)
##                           LRT df       pval
## LR test, lambda = (0) 41.3912  1 1.2462e-10
## 
## Likelihood ratio test that no transformation is needed
##                            LRT df       pval
## LR test, lambda = (1) 1162.427  1 < 2.22e-16
invResPlot(f2)
f6<-lm(log(charges) ~ age  + bmi + children + smoker+region, data = df) # use log instead of 1/3 power for the response variable because it makes more quantiles fashion the Normal distribution
summary(f6)
## 
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region, 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10302 -0.19707 -0.05206  0.06564  2.15091 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.0008478  0.0719853  97.254  < 2e-16 ***
## age              0.0346490  0.0008746  39.618  < 2e-16 ***
## bmi              0.0130711  0.0021004   6.223 6.52e-10 ***
## children         0.1013204  0.0101304  10.002  < 2e-16 ***
## smokeryes        1.5472965  0.0302910  51.081  < 2e-16 ***
## regionnorthwest -0.0633386  0.0350174  -1.809 0.070712 .  
## regionsoutheast -0.1568166  0.0351952  -4.456 9.07e-06 ***
## regionsouthwest -0.1285638  0.0351393  -3.659 0.000263 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4457 on 1330 degrees of freedom
## Multiple R-squared:  0.7663, Adjusted R-squared:  0.765 
## F-statistic: 622.9 on 7 and 1330 DF,  p-value: < 2.2e-16
plot(f6)

Add Weight

df2<-suppressWarnings(transform(df, SD=apply(df,1, sd, na.rm = TRUE))) # new column sd
head(df2)
f3<-lm(log(charges) ~ age  + bmi + children + smoker+region, weights = 1/SD^2,data = df2)
summary(f3)
## 
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region, 
##     data = df2, weights = 1/SD^2)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -4.276e-04 -3.800e-05 -2.050e-06  5.623e-05  3.691e-04 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.6002913  0.0296889 222.315  < 2e-16 ***
## age              0.0524545  0.0006374  82.288  < 2e-16 ***
## bmi              0.0001969  0.0008373   0.235    0.814    
## children         0.1956888  0.0063994  30.579  < 2e-16 ***
## smokeryes        1.5565488  0.0697526  22.315  < 2e-16 ***
## regionnorthwest -0.0834179  0.0167097  -4.992 6.76e-07 ***
## regionsoutheast -0.2893256  0.0158689 -18.232  < 2e-16 ***
## regionsouthwest -0.2598225  0.0157648 -16.481  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.933e-05 on 1330 degrees of freedom
## Multiple R-squared:  0.9067, Adjusted R-squared:  0.9063 
## F-statistic:  1847 on 7 and 1330 DF,  p-value: < 2.2e-16
plot(f3) # increased normality a lot

f7<-lm((charges)^(1/3) ~ age  + bmi + children + smoker+region, weights = 1/SD^2,data = df2) # normality issue did not solved by 1/3 power transformation, therefore log transformation is a good chioce. $
summary(f7)
## 
## Call:
## lm(formula = (charges)^(1/3) ~ age + bmi + children + smoker + 
##     region, data = df2, weights = 1/SD^2)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.248e-03 -1.457e-04  3.290e-06  2.436e-04  1.994e-03 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.202067   0.134281  53.634  < 2e-16 ***
## age              0.277962   0.002883  96.409  < 2e-16 ***
## bmi              0.004463   0.003787   1.178    0.239    
## children         0.891054   0.028944  30.785  < 2e-16 ***
## smokeryes       11.233184   0.315487  35.606  < 2e-16 ***
## regionnorthwest -0.411715   0.075577  -5.448 6.08e-08 ***
## regionsoutheast -1.226795   0.071774 -17.092  < 2e-16 ***
## regionsouthwest -1.140764   0.071303 -15.999  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000404 on 1330 degrees of freedom
## Multiple R-squared:  0.9287, Adjusted R-squared:  0.9284 
## F-statistic:  2476 on 7 and 1330 DF,  p-value: < 2.2e-16
plot(f7)

pairs(~charges+age+bmi+ children ,data=df,main="Insurance Scatterplot Matrix")

f4<-lm(log(charges) ~ age  + bmi + children + smoker+region+age:children+bmi:smoker+age:bmi+age:smoker+bmi:children+children:smoker, weights = 1/SD^2,data = df2) # add all possible interaction terms
summary(f4)
## 
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region + 
##     age:children + bmi:smoker + age:bmi + age:smoker + bmi:children + 
##     children:smoker, data = df2, weights = 1/SD^2)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.845e-04 -2.971e-05 -1.600e-06  2.938e-05  3.763e-04 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.473e+00  6.961e-02  92.998  < 2e-16 ***
## age                 5.856e-02  3.013e-03  19.432  < 2e-16 ***
## bmi                 1.747e-04  2.183e-03   0.080   0.9362    
## children            3.662e-01  3.281e-02  11.159  < 2e-16 ***
## smokeryes           1.574e+00  3.493e-01   4.507 7.16e-06 ***
## regionnorthwest    -8.749e-02  1.513e-02  -5.782 9.18e-09 ***
## regionsoutheast    -2.888e-01  1.437e-02 -20.097  < 2e-16 ***
## regionsouthwest    -2.655e-01  1.429e-02 -18.579  < 2e-16 ***
## age:children       -7.850e-03  5.669e-04 -13.848  < 2e-16 ***
## bmi:smokeryes       5.534e-02  1.163e-02   4.760 2.15e-06 ***
## age:bmi            -1.595e-05  9.477e-05  -0.168   0.8664    
## age:smokeryes      -4.063e-02  4.978e-03  -8.162 7.59e-16 ***
## bmi:children        1.791e-03  9.861e-04   1.816   0.0696 .  
## children:smokeryes -1.115e-01  5.474e-02  -2.036   0.0419 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.086e-05 on 1324 degrees of freedom
## Multiple R-squared:  0.9239, Adjusted R-squared:  0.9232 
## F-statistic:  1237 on 13 and 1324 DF,  p-value: < 2.2e-16
plot(f4)

ols_step_backward_p(f4,details = TRUE) # apply backward elimination again
## Backward Elimination Method 
## ---------------------------
## 
## Candidate Terms: 
## 
## 1 . age 
## 2 . bmi 
## 3 . children 
## 4 . smoker 
## 5 . region 
## 6 . age:children 
## 7 . bmi:smoker 
## 8 . age:bmi 
## 9 . age:smoker 
## 10 . bmi:children 
## 11 . children:smoker 
## 
## We are eliminating variables based on p value...
## 
## - bmi:children 
## 
## Backward Elimination: Step 1 
## 
##  Variable bmi:children Removed 
## 
##                         Model Summary                         
## -------------------------------------------------------------
## R                       0.912       RMSE               0.380 
## R-Squared               0.831       Coef. Var          4.173 
## Adj. R-Squared          0.829       MSE                0.144 
## Pred R-Squared          0.828       MAE                0.207 
## -------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                  ANOVA                                  
## -----------------------------------------------------------------------
##                 Sum of                                                 
##                Squares          DF    Mean Square       F         Sig. 
## -----------------------------------------------------------------------
## Regression     939.444          12         78.287    543.006    0.0000 
## Residual       191.030        1325          0.144                      
## Total         1130.474        1337                                     
## -----------------------------------------------------------------------
## 
##                                       Parameter Estimates                                        
## ------------------------------------------------------------------------------------------------
##              model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## ------------------------------------------------------------------------------------------------
##        (Intercept)     6.838         0.161                  42.517    0.000     6.522     7.153 
##                age     0.047         0.004        0.721     12.217    0.000     0.040     0.055 
##                bmi     0.005         0.005        0.031      0.918    0.359    -0.005     0.015 
##           children     0.282         0.028        0.369     10.176    0.000     0.227     0.336 
##          smokeryes     1.412         0.145        0.620      9.746    0.000     1.128     1.696 
##    regionnorthwest    -0.057         0.030       -0.027     -1.923    0.055    -0.116     0.001 
##    regionsoutheast    -0.141         0.030       -0.068     -4.694    0.000    -0.200    -0.082 
##    regionsouthwest    -0.148         0.030       -0.069     -4.945    0.000    -0.207    -0.089 
##       age:children    -0.004         0.001       -0.218     -5.907    0.000    -0.005    -0.003 
##      bmi:smokeryes     0.050         0.004        0.688     12.027    0.000     0.042     0.058 
##            age:bmi     0.000         0.000       -0.049     -0.702    0.483     0.000     0.000 
##      age:smokeryes    -0.032         0.002       -0.591    -17.416    0.000    -0.036    -0.029 
## children:smokeryes    -0.123         0.022       -0.093     -5.581    0.000    -0.167    -0.080 
## ------------------------------------------------------------------------------------------------
## 
## 
## - age:bmi 
## 
## Backward Elimination: Step 2 
## 
##  Variable age:bmi Removed 
## 
##                         Model Summary                         
## -------------------------------------------------------------
## R                       0.912       RMSE               0.380 
## R-Squared               0.831       Coef. Var          4.172 
## Adj. R-Squared          0.830       MSE                0.144 
## Pred R-Squared          0.828       MAE                0.208 
## -------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                  ANOVA                                  
## -----------------------------------------------------------------------
##                 Sum of                                                 
##                Squares          DF    Mean Square       F         Sig. 
## -----------------------------------------------------------------------
## Regression     939.373          11         85.398    592.552    0.0000 
## Residual       191.101        1326          0.144                      
## Total         1130.474        1337                                     
## -----------------------------------------------------------------------
## 
##                                       Parameter Estimates                                        
## ------------------------------------------------------------------------------------------------
##              model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## ------------------------------------------------------------------------------------------------
##        (Intercept)     6.939         0.072                  96.763    0.000     6.798     7.080 
##                age     0.045         0.001        0.681     45.649    0.000     0.043     0.047 
##                bmi     0.001         0.002        0.009      0.696    0.487    -0.003     0.005 
##           children     0.282         0.028        0.370     10.195    0.000     0.228     0.336 
##          smokeryes     1.410         0.145        0.619      9.735    0.000     1.126     1.694 
##    regionnorthwest    -0.057         0.030       -0.027     -1.907    0.057    -0.115     0.002 
##    regionsoutheast    -0.140         0.030       -0.068     -4.671    0.000    -0.199    -0.081 
##    regionsouthwest    -0.149         0.030       -0.069     -4.960    0.000    -0.207    -0.090 
##       age:children    -0.004         0.001       -0.219     -5.928    0.000    -0.005    -0.003 
##      bmi:smokeryes     0.050         0.004        0.690     12.068    0.000     0.042     0.058 
##      age:smokeryes    -0.032         0.002       -0.592    -17.437    0.000    -0.036    -0.029 
## children:smokeryes    -0.124         0.022       -0.093     -5.585    0.000    -0.167    -0.080 
## ------------------------------------------------------------------------------------------------
## 
## 
## 
## No more variables satisfy the condition of p value = 0.3
## 
## 
## Variables Removed: 
## 
## - bmi:children 
## - age:bmi 
## 
## 
## Final Model Output 
## ------------------
## 
##                         Model Summary                         
## -------------------------------------------------------------
## R                       0.912       RMSE               0.380 
## R-Squared               0.831       Coef. Var          4.172 
## Adj. R-Squared          0.830       MSE                0.144 
## Pred R-Squared          0.828       MAE                0.208 
## -------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
## 
##                                  ANOVA                                  
## -----------------------------------------------------------------------
##                 Sum of                                                 
##                Squares          DF    Mean Square       F         Sig. 
## -----------------------------------------------------------------------
## Regression     939.373          11         85.398    592.552    0.0000 
## Residual       191.101        1326          0.144                      
## Total         1130.474        1337                                     
## -----------------------------------------------------------------------
## 
##                                       Parameter Estimates                                        
## ------------------------------------------------------------------------------------------------
##              model      Beta    Std. Error    Std. Beta       t        Sig      lower     upper 
## ------------------------------------------------------------------------------------------------
##        (Intercept)     6.939         0.072                  96.763    0.000     6.798     7.080 
##                age     0.045         0.001        0.681     45.649    0.000     0.043     0.047 
##                bmi     0.001         0.002        0.009      0.696    0.487    -0.003     0.005 
##           children     0.282         0.028        0.370     10.195    0.000     0.228     0.336 
##          smokeryes     1.410         0.145        0.619      9.735    0.000     1.126     1.694 
##    regionnorthwest    -0.057         0.030       -0.027     -1.907    0.057    -0.115     0.002 
##    regionsoutheast    -0.140         0.030       -0.068     -4.671    0.000    -0.199    -0.081 
##    regionsouthwest    -0.149         0.030       -0.069     -4.960    0.000    -0.207    -0.090 
##       age:children    -0.004         0.001       -0.219     -5.928    0.000    -0.005    -0.003 
##      bmi:smokeryes     0.050         0.004        0.690     12.068    0.000     0.042     0.058 
##      age:smokeryes    -0.032         0.002       -0.592    -17.437    0.000    -0.036    -0.029 
## children:smokeryes    -0.124         0.022       -0.093     -5.585    0.000    -0.167    -0.080 
## ------------------------------------------------------------------------------------------------
## 
## 
##                                   Elimination Summary                                   
## ---------------------------------------------------------------------------------------
##         Variable                      Adj.                                                 
## Step      Removed       R-Square    R-Square          C(p)             AIC        RMSE     
## ---------------------------------------------------------------------------------------
##    1    bmi:children       0.831      0.8295    29218352442.9501    1220.6601    0.3797    
##    2    age:bmi            0.831      0.8296    29229226142.6719    1219.1580    0.3796    
## ---------------------------------------------------------------------------------------
f5<-lm(log(charges) ~ age  + bmi + children + smoker+region+age:children+bmi:smoker+age:smoker+children:smoker, weights = 1/SD^2,data = df2)
summary(f5) # final model, the good one
## 
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region + 
##     age:children + bmi:smoker + age:smoker + children:smoker, 
##     data = df2, weights = 1/SD^2)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.754e-04 -2.966e-05 -1.020e-06  2.947e-05  3.765e-04 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.4627945  0.0286551 225.537  < 2e-16 ***
## age                 0.0580826  0.0006916  83.985  < 2e-16 ***
## bmi                 0.0005078  0.0007608   0.667   0.5046    
## children            0.4170594  0.0170953  24.396  < 2e-16 ***
## smokeryes           1.5416300  0.3487738   4.420 1.07e-05 ***
## regionnorthwest    -0.0874500  0.0151389  -5.777 9.49e-09 ***
## regionsoutheast    -0.2893022  0.0143764 -20.123  < 2e-16 ***
## regionsouthwest    -0.2646088  0.0142848 -18.524  < 2e-16 ***
## age:children       -0.0077483  0.0005646 -13.724  < 2e-16 ***
## bmi:smokeryes       0.0566649  0.0115877   4.890 1.13e-06 ***
## age:smokeryes      -0.0406418  0.0049718  -8.175 6.87e-16 ***
## children:smokeryes -0.1185118  0.0546348  -2.169   0.0302 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.091e-05 on 1326 degrees of freedom
## Multiple R-squared:  0.9237, Adjusted R-squared:  0.9231 
## F-statistic:  1460 on 11 and 1326 DF,  p-value: < 2.2e-16
residualPlot(f5)

plot(f5)

## Questions and Findings 0. descriptive stat

summary(df)
##       age            sex           bmi           children     smoker    
##  Min.   :18.00   female:662   Min.   :15.96   Min.   :0.000   no :1064  
##  1st Qu.:27.00   male  :676   1st Qu.:26.30   1st Qu.:0.000   yes: 274  
##  Median :39.00                Median :30.40   Median :1.000             
##  Mean   :39.21                Mean   :30.66   Mean   :1.095             
##  3rd Qu.:51.00                3rd Qu.:34.69   3rd Qu.:2.000             
##  Max.   :64.00                Max.   :53.13   Max.   :5.000             
##        region       charges     
##  northeast:324   Min.   : 1122  
##  northwest:325   1st Qu.: 4740  
##  southeast:364   Median : 9382  
##  southwest:325   Mean   :13270  
##                  3rd Qu.:16640  
##                  Max.   :63770
  1. correlation matrix heatmap
#head(df2)
df2$sex <- ifelse(df2$sex=="male", 1, 0)
df2$smoker <- ifelse(df2$smoker=="yes", 1, 0)
head(df2)
df3 <- df2[, c(1,2,3,4,5,7)]
head(df3)
cormat <- round(cor(df3),2)
head(cormat)
##            age   sex  bmi children smoker charges
## age       1.00 -0.02 0.11     0.04  -0.03    0.30
## sex      -0.02  1.00 0.05     0.02   0.08    0.06
## bmi       0.11  0.05 1.00     0.01   0.00    0.20
## children  0.04  0.02 0.01     1.00   0.01    0.07
## smoker   -0.03  0.08 0.00     0.01   1.00    0.79
## charges   0.30  0.06 0.20     0.07   0.79    1.00
melted_cormat <- melt(cormat)
head(melted_cormat)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()

get_lower_tri<-function(cormat){
    cormat[upper.tri(cormat)] <- NA
    return(cormat)
}
get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
}
upper_tri <- get_upper_tri(cormat)
upper_tri
##          age   sex  bmi children smoker charges
## age        1 -0.02 0.11     0.04  -0.03    0.30
## sex       NA  1.00 0.05     0.02   0.08    0.06
## bmi       NA    NA 1.00     0.01   0.00    0.20
## children  NA    NA   NA     1.00   0.01    0.07
## smoker    NA    NA   NA       NA   1.00    0.79
## charges   NA    NA   NA       NA     NA    1.00
melted_cormat <- melt(upper_tri, na.rm = TRUE)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "red", high = "blue", mid = "pink", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Correlation Heatmap") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()

  1. box plot of smokers, regions
boxplot(charges~smoker,
data=df,
main="Charges for Non-Smokers vs Smokers",
xlab="Smoking Habit",
ylab="Medical Insurance Cost",
col="orange",
border="brown"
)

boxplot(charges~region,
data=df,
main="Charges for US Regions",
xlab="US Regions",
ylab="Medical Insurance Cost",
col="orange",
border="brown"
)

  1. what is the model? is there a positive linear relationship between log(charges) and age? CI? H0: B1=0, H1:B1>0. p-value < 2e-16 which is < 0.05 therefore reject H0, yes.
confint(f5,"age",level=0.95)
##          2.5 %     97.5 %
## age 0.05672583 0.05943928

with smoking?

exp(1.5416300) # a smoker will spend $4.67 more than a non-smoker. 
## [1] 4.6722

with children?

confint(f5,"children",level=0.95) # and yes
##              2.5 %    97.5 %
## children 0.3835226 0.4505962

bmi?

confint(f5,"bmi",level=0.95) #No, not in the final model
##             2.5 %      97.5 %
## bmi -0.0009847388 0.002000344
  1. what expenses would you forecast, for a 22-year-old female non-smoker comes from northwest region with 0 children and 17.9 bmi? What is the 95% prediction interval?
df4 <- data.frame(smoker = "no", children = 0, age = 22,bmi=17.9,region="northwest")
p1<-predict(f5, df4, interval = 'prediction')
## Warning in predict.lm(f5, df4, interval = "prediction"): Assuming constant prediction variance even though model fit is weighted
p1
##       fit     lwr      upr
## 1 7.66225 7.63622 7.688281
exp(p1)
##        fit      lwr     upr
## 1 2126.537 2071.896 2182.62

Model Selection table

not fitted in the same dataframe because I added a new column for the weight but the predictors did not change.

model.sel(f1, f2, f3, f4, f5, f6, f7, rank = AIC)
## Warning in model.sel.default(f1, f2, f3, f4, f5, f6, f7, rank = AIC): response
## differs between models
## Warning in model.sel.default(f1, f2, f3, f4, f5, f6, f7, rank = AIC): models are
## not all fitted to the same data

ANOVA table for f5 which is the final model

#parameterEstimates(f5)
anova(f5)
summary(f5)
## 
## Call:
## lm(formula = log(charges) ~ age + bmi + children + smoker + region + 
##     age:children + bmi:smoker + age:smoker + children:smoker, 
##     data = df2, weights = 1/SD^2)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.754e-04 -2.966e-05 -1.020e-06  2.947e-05  3.765e-04 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.4627945  0.0286551 225.537  < 2e-16 ***
## age                 0.0580826  0.0006916  83.985  < 2e-16 ***
## bmi                 0.0005078  0.0007608   0.667   0.5046    
## children            0.4170594  0.0170953  24.396  < 2e-16 ***
## smokeryes           1.5416300  0.3487738   4.420 1.07e-05 ***
## regionnorthwest    -0.0874500  0.0151389  -5.777 9.49e-09 ***
## regionsoutheast    -0.2893022  0.0143764 -20.123  < 2e-16 ***
## regionsouthwest    -0.2646088  0.0142848 -18.524  < 2e-16 ***
## age:children       -0.0077483  0.0005646 -13.724  < 2e-16 ***
## bmi:smokeryes       0.0566649  0.0115877   4.890 1.13e-06 ***
## age:smokeryes      -0.0406418  0.0049718  -8.175 6.87e-16 ***
## children:smokeryes -0.1185118  0.0546348  -2.169   0.0302 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.091e-05 on 1326 degrees of freedom
## Multiple R-squared:  0.9237, Adjusted R-squared:  0.9231 
## F-statistic:  1460 on 11 and 1326 DF,  p-value: < 2.2e-16